#function to resize plots later
fig <- function(width, heigth){options(repr.plot.width = width, repr.plot.height = heigth)}

Wrangling the Data

conn <- dbConnect(RSQLite::SQLite(), "../spotify_analysis/charts.sqlite")
df <- tbl(conn, "top200") %>% collect()
dbDisconnect(conn)
df <- df %>% 
  separate_rows(artist, sep = ", ") %>%
  mutate(date = ymd(date), trend = factor(trend))
summary(df)
##     title                rank            date               artist         
##  Length:447644      Min.   :  1.0   Min.   :2017-01-01   Length:447644     
##  Class :character   1st Qu.: 51.0   1st Qu.:2018-02-20   Class :character  
##  Mode  :character   Median :100.0   Median :2019-03-26   Mode  :character  
##                     Mean   :100.2   Mean   :2019-03-23                     
##                     3rd Qu.:150.0   3rd Qu.:2020-04-19                     
##                     Max.   :200.0   Max.   :2021-06-15                     
##      url                      trend           streams        
##  Length:447644      MOVE_DOWN    :192850   Min.   :  325951  
##  Class :character   MOVE_UP      :176767   1st Qu.:  704685  
##  Mode  :character   NEW_ENTRY    : 19818   Median :  900287  
##                     SAME_POSITION: 58209   Mean   : 1183243  
##                                            3rd Qu.: 1357400  
##                                            Max.   :17223237
#check for null values

df %>% select(artist) %>% is.na() %>% any()
## [1] FALSE

Apparently there are no NULL Values in the dataset

df %>%
  select(artist) %>%
  arrange(artist) %>%
  unique() %>%
  head()
## # A tibble: 6 x 1
##   artist     
##   <chr>      
## 1 ""         
## 2 "-"        
## 3 "$NOT"     
## 4 "(G)I-DLE" 
## 5 "*NSYNC"   
## 6 "070 Shake"

Here we can see that we have no Null values but “-” and ” “. So we need to delete them from the file. Let’s have a look at the wrongful entries:

df %>% filter(artist %in% c('-', ''))
## # A tibble: 20 x 7
##    title            rank date       artist url                     trend streams
##    <chr>           <int> <date>     <chr>  <chr>                   <fct>   <int>
##  1 "MLK Interlude"   148 2021-03-20 "-"    https://open.spotify.c~ MOVE~  886362
##  2 "MLK Interlude"    70 2021-03-19 "-"    https://open.spotify.c~ NEW_~ 1426117
##  3 ""                156 2017-12-01 ""     https://open.spotify.c~ NEW_~  650496
##  4 ""                181 2017-12-01 ""     https://open.spotify.c~ NEW_~  588738
##  5 ""                 30 2017-11-14 ""     https://open.spotify.c~ MOVE~ 1581132
##  6 ""                 31 2017-11-13 ""     https://open.spotify.c~ MOVE~ 1473457
##  7 ""                 27 2017-11-12 ""     https://open.spotify.c~ MOVE~ 1544175
##  8 ""                 26 2017-11-11 ""     https://open.spotify.c~ MOVE~ 1762307
##  9 ""                 30 2017-11-10 ""     https://open.spotify.c~ MOVE~ 1660185
## 10 ""                 32 2017-11-09 ""     https://open.spotify.c~ MOVE~ 1496980
## 11 ""                 89 2017-07-23 ""     https://open.spotify.c~ MOVE~  690247
## 12 ""                101 2017-07-22 ""     https://open.spotify.c~ NEW_~  747893
## 13 ""                  5 2017-07-21 ""     https://open.spotify.c~ SAME~ 3653533
## 14 ""                 16 2017-07-21 ""     https://open.spotify.c~ MOVE~ 2522453
## 15 ""                 35 2017-07-21 ""     https://open.spotify.c~ MOVE~ 1798890
## 16 ""                 48 2017-07-21 ""     https://open.spotify.c~ MOVE~ 1526955
## 17 ""                  5 2017-07-20 ""     https://open.spotify.c~ NEW_~ 3568811
## 18 ""                 13 2017-07-20 ""     https://open.spotify.c~ NEW_~ 2571960
## 19 ""                 30 2017-07-20 ""     https://open.spotify.c~ NEW_~ 1798208
## 20 ""                 39 2017-07-20 ""     https://open.spotify.c~ NEW_~ 1520291

We get a bunch of entries which we have to delete from the Dataset

df <- df %>%  filter(!(artist %in% c('-', '')))

Checking if it worked:

df %>%
  select(artist) %>%
  arrange(artist) %>%
  unique() %>%
  head()
## # A tibble: 6 x 1
##   artist           
##   <chr>            
## 1 $NOT             
## 2 (G)I-DLE         
## 3 *NSYNC           
## 4 070 Shake        
## 5 13 Organisé      
## 6 187 Strassenbande

Looks good :-) No more empty entries

Starting to explore the data

Which artists are in the Top 200?

df %>% 
  select(artist) %>% 
  distinct() %>% 
  nrow()
## [1] 1594

So there are 1594 artists in the Top 200 Dataset.

Let’s explore which of these artists appeared the most often in the Top 200

df %>% 
  select(artist) %>%
  group_by(artist) %>% 
  count() %>% 
  arrange(desc(n))
## # A tibble: 1,594 x 2
## # Groups:   artist [1,594]
##    artist            n
##    <chr>         <int>
##  1 Ed Sheeran     9576
##  2 Post Malone    9542
##  3 J Balvin       9421
##  4 Bad Bunny      8471
##  5 Drake          7293
##  6 Ozuna          7040
##  7 XXXTENTACION   6785
##  8 Billie Eilish  6582
##  9 Travis Scott   5586
## 10 Ariana Grande  5282
## # ... with 1,584 more rows

Ed Sheeran, Post Malone and J Balvin sit comfortably on top of the list. But it’s a close race between them. But who got the most streams?

num_streams <- df %>%  
  select(artist, streams) %>% 
  group_by(artist) %>% 
  summarise(num_streams= sum(streams)) %>% 
  arrange(desc(num_streams))

num_streams %>% head(10)
## # A tibble: 10 x 2
##    artist        num_streams
##    <chr>               <dbl>
##  1 Post Malone   13976842359
##  2 Bad Bunny     11835167604
##  3 J Balvin      11695364249
##  4 Ed Sheeran    11475793196
##  5 Drake          9934735017
##  6 Billie Eilish  8674898986
##  7 Ariana Grande  8271296865
##  8 Ozuna          7713621696
##  9 XXXTENTACION   7063878971
## 10 Dua Lipa       6833617322

Seems like Bad Bunny has some very popular songs. Now we look into the number of songs each artist has in the Dataset. Do the artists with the most streams also have to most tracks?

num_tracks <- df %>% 
  select(artist, title) %>%
  unique() %>% 
  group_by(artist) %>% 
  count(name = "num_tracks") %>% 
  arrange(desc(num_tracks))

num_tracks %>% head(10)
## # A tibble: 10 x 2
## # Groups:   artist [10]
##    artist       num_tracks
##    <chr>             <int>
##  1 Drake               101
##  2 Future               99
##  3 Taylor Swift         96
##  4 Bad Bunny            90
##  5 BTS                  78
##  6 Juice WRLD           73
##  7 Lil Uzi Vert         71
##  8 Eminem               66
##  9 J Balvin             66
## 10 Logic                62
df_join <- num_streams %>% inner_join(num_tracks)
## Joining, by = "artist"
head(df_join)
## # A tibble: 6 x 3
##   artist        num_streams num_tracks
##   <chr>               <dbl>      <int>
## 1 Post Malone   13976842359         46
## 2 Bad Bunny     11835167604         90
## 3 J Balvin      11695364249         66
## 4 Ed Sheeran    11475793196         50
## 5 Drake          9934735017        101
## 6 Billie Eilish  8674898986         29
ggplot(df_join, aes(df_join$num_streams, df_join$num_tracks)) + geom_point() + geom_smooth(method = lm) + labs(x = "Stream Count", y = "Track Count", title = "Correlation Track and Stream Count")
## Warning: Use of `df_join$num_streams` is discouraged. Use `num_streams` instead.
## Warning: Use of `df_join$num_tracks` is discouraged. Use `num_tracks` instead.
## Warning: Use of `df_join$num_streams` is discouraged. Use `num_streams` instead.
## Warning: Use of `df_join$num_tracks` is discouraged. Use `num_tracks` instead.
## `geom_smooth()` using formula 'y ~ x'

cor.test(df_join$num_streams, df_join$num_tracks)
## 
##  Pearson's product-moment correlation
## 
## data:  df_join$num_streams and df_join$num_tracks
## t = 43.189, df = 1592, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7110645 0.7563409
## sample estimates:
##       cor 
## 0.7345191

Technically there is a high correlation. But if we look at the graph, we can see that that’s probably the case because of the high density around 0. There is no linear trend visible. We can observe a high correlation between the number of songs and the number of streams.

Which artist had the most tracks in the Top200 at one single day?

df %>% 
  select(artist, date) %>% 
  group_by(date) %>% 
  count(artist) %>% 
  arrange(desc(n)) %>% 
  head(10)
## # A tibble: 10 x 3
## # Groups:   date [10]
##    date       artist           n
##    <date>     <chr>        <int>
##  1 2018-06-19 XXXTENTACION    37
##  2 2017-07-21 Linkin Park     34
##  3 2017-03-19 Drake           30
##  4 2017-03-20 Drake           29
##  5 2017-03-21 Drake           29
##  6 2017-03-22 Drake           29
##  7 2017-03-23 Drake           29
##  8 2018-06-29 Drake           29
##  9 2020-03-13 Lil Uzi Vert    29
## 10 2018-06-20 XXXTENTACION    28

XXXTENTACION had 37 songs at once in the Top 200 on the 19th June 2018. He got killed on the 18th June 2018 which probably explains this. Linkin Park had 34 Songs at once in the Top 200 on the 21th July 2017. One day before that their vocalist Chester Bennington commited suicide. Music unites people in bad (but also in good!) times…

Which track had the most streams in one day?

df %>% 
  select(-c(url, trend)) %>%
  arrange(desc(streams)) %>%
  head(20)
## # A tibble: 20 x 5
##    title                                        rank date       artist   streams
##    <chr>                                       <int> <date>     <chr>      <int>
##  1 All I Want for Christmas Is You                 1 2020-12-24 Mariah ~  1.72e7
##  2 Last Christmas                                  2 2020-12-24 Wham!     1.58e7
##  3 drivers license                                 1 2021-01-15 Olivia ~  1.37e7
##  4 drivers license                                 1 2021-01-14 Olivia ~  1.37e7
##  5 drivers license                                 1 2021-01-13 Olivia ~  1.36e7
##  6 All I Want for Christmas Is You                 1 2020-12-25 Mariah ~  1.36e7
##  7 drivers license                                 1 2021-01-12 Olivia ~  1.34e7
##  8 good 4 u                                        1 2021-05-25 Olivia ~  1.26e7
##  9 good 4 u                                        1 2021-05-26 Olivia ~  1.24e7
## 10 Last Christmas                                  2 2020-12-25 Wham!     1.24e7
## 11 good 4 u                                        1 2021-05-27 Olivia ~  1.23e7
## 12 Santa Tell Me                                   3 2020-12-24 Ariana ~  1.22e7
## 13 good 4 u                                        1 2021-05-24 Olivia ~  1.22e7
## 14 good 4 u                                        1 2021-05-21 Olivia ~  1.22e7
## 15 drivers license                                 1 2021-01-16 Olivia ~  1.21e7
## 16 All I Want for Christmas Is You                 1 2019-12-24 Mariah ~  1.20e7
## 17 It's Beginning to Look a Lot like Christmas     4 2020-12-24 Michael~  1.20e7
## 18 Jingle Bell Rock                                5 2020-12-24 Bobby H~  1.19e7
## 19 Rockin' Around The Christmas Tree               6 2020-12-24 Brenda ~  1.18e7
## 20 good 4 u                                        1 2021-05-28 Olivia ~  1.18e7

It makes sense that most of those songs are popular christmas hits! But also Olivia Rodrigo seems to be very popular Let’s have a closer look at her:

df %>%
  arrange(desc(streams)) %>% 
  head(50) %>% 
  filter(artist == "Olivia Rodrigo") %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    28

Let’s see what place she gets in the 50 most streams songs:

df %>%
  arrange(desc(streams))%>%
  head(50) %>%
  group_by(artist) %>% 
  count() %>% 
  arrange(desc(n))
## # A tibble: 14 x 2
## # Groups:   artist [14]
##    artist             n
##    <chr>          <int>
##  1 Olivia Rodrigo    28
##  2 Mariah Carey       4
##  3 Wham!              3
##  4 Ariana Grande      2
##  5 Bobby Helms        2
##  6 Brenda Lee         2
##  7 Michael Bublé      2
##  8 Andy Williams      1
##  9 Bad Bunny          1
## 10 BTS                1
## 11 Ed Sheeran         1
## 12 Jhay Cortez        1
## 13 Luis Fonsi         1
## 14 XXXTENTACION       1

Crazy, more than 50% of the 50 most streamed songs are from Olivia Rodrigo!

df %>% 
  filter(artist == "Olivia Rodrigo") %>% 
  arrange(date) %>% 
  head(1)
## # A tibble: 1 x 7
##   title                               rank date       artist url   trend streams
##   <chr>                              <int> <date>     <chr>  <chr> <fct>   <int>
## 1 "All I Want - From \"High School ~   192 2020-01-06 Olivi~ http~ NEW_~  642819

The first time she appeared on the charts was 2020-01-06. Apparently she got famous through high school musical. Let’s see how she did from there.

df %>% 
  filter(artist == "Olivia Rodrigo") %>% 
  ggplot(aes(y = rank, x = date, color = title)) + geom_line(show.legend = FALSE) + scale_y_reverse() + theme_minimal()

So we see that she uninterrupted stayed in the charts since her first song. The longest with her first song. The songs after that stayed shorter in top ranks.

Her total number of streams:

df %>% 
  filter(artist == "Olivia Rodrigo") %>% 
  summarise(streams_total = sum(streams))
## # A tibble: 1 x 1
##   streams_total
##           <int>
## 1    2087769350

Her total number of songs:

df %>%
  filter(artist == "Olivia Rodrigo") %>% 
  select(title) %>% 
  unique() %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    12

How do the streams distribute over her songs?

df %>% 
  filter(artist == "Olivia Rodrigo") %>% 
  group_by(title) %>% 
  summarise(total_streams = sum(streams)) %>% 
  arrange(desc(total_streams))
## # A tibble: 12 x 2
##    title                                                           total_streams
##    <chr>                                                                   <int>
##  1 "drivers license"                                                   813710554
##  2 "good 4 u"                                                          317055897
##  3 "deja vu"                                                           297460035
##  4 "traitor"                                                           106119949
##  5 "happier"                                                            86403906
##  6 "brutal"                                                             82455487
##  7 "favorite crime"                                                     82363485
##  8 "enough for you"                                                     71417140
##  9 "jealousy, jealousy"                                                 64540563
## 10 "1 step forward, 3 steps back"                                       62314847
## 11 "All I Want - From \"High School Musical: The Musical: The Ser~      53199370
## 12 "hope ur ok"                                                         50728117

Percentage distribution:

df %>%
  filter(artist == "Olivia Rodrigo") %>%
  group_by(title) %>%
  summarise(total_streams_percent = sum(streams)) %>%
  mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent) * 100) %>%
  arrange(desc(total_streams_percent))
## # A tibble: 12 x 2
##    title                                                        total_streams_p~
##    <chr>                                                                   <dbl>
##  1 "drivers license"                                                       39.0 
##  2 "good 4 u"                                                              15.2 
##  3 "deja vu"                                                               14.2 
##  4 "traitor"                                                                5.08
##  5 "happier"                                                                4.14
##  6 "brutal"                                                                 3.95
##  7 "favorite crime"                                                         3.95
##  8 "enough for you"                                                         3.42
##  9 "jealousy, jealousy"                                                     3.09
## 10 "1 step forward, 3 steps back"                                           2.98
## 11 "All I Want - From \"High School Musical: The Musical: The ~             2.55
## 12 "hope ur ok"                                                             2.43
df %>%  
  filter(artist == "Olivia Rodrigo") %>% 
  select(title, streams) %>% 
  group_by(title) %>% 
  ggplot(aes(x = title, y = streams, fill = title)) + geom_col(show.legend = FALSE) + theme_minimal()

Well, this does not look very clean… Let’s try out the plotly Library

library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot <- df %>%  
  filter(artist == "Olivia Rodrigo") %>%
  group_by(title) %>%
  summarise(total_streams_percent = sum(streams)) %>%
  mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent) * 100) %>%
  ggplot(aes(x = title, y = total_streams_percent, fill = title)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_blank(), legend.position = "none")+
  labs(title = "%-Distribution of Olivia Rodrigo streams", x = "Song", y = "%-Streams")

ggplotly(plot)

The Song “Drivers License” is the most popular

df %>% 
  filter(rank == 1) %>% 
  select(artist) %>% 
  n_distinct
## [1] 47

Which artist got on place No. 1 the most times?

df %>% 
  filter(rank == 1) %>% 
  select(artist) %>% 
  group_by(artist) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  head()
## # A tibble: 6 x 2
## # Groups:   artist [6]
##   artist            n
##   <chr>         <int>
## 1 Post Malone     174
## 2 Drake           163
## 3 Ed Sheeran      139
## 4 Ariana Grande   133
## 5 Tones And I     120
## 6 21 Savage       114

Which title was No. 1 for the longest time?

df %>%
  filter(rank == 1) %>%
  group_by(title) %>%  
  summarise(days_on_1 = n_distinct(date)) %>% 
  arrange(desc(days_on_1))
## # A tibble: 66 x 2
##    title             days_on_1
##    <chr>                 <int>
##  1 Dance Monkey            120
##  2 rockstar                114
##  3 Señorita                102
##  4 Shape of You             97
##  5 Despacito - Remix        96
##  6 Blinding Lights          82
##  7 God's Plan               74
##  8 7 rings                  68
##  9 drivers license          67
## 10 DÁKITI                   60
## # ... with 56 more rows

Stream distribution among No. 1 artists in percent

df %>%  
  filter(rank == 1) %>% 
  group_by(artist) %>%
  summarise(total_streams_percent = sum(streams)) %>%
  mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent)*100) %>% 
  arrange(desc(total_streams_percent))
## # A tibble: 47 x 2
##    artist         total_streams_percent
##    <chr>                          <dbl>
##  1 Drake                           8.12
##  2 Ed Sheeran                      7.06
##  3 Post Malone                     6.66
##  4 Ariana Grande                   6.32
##  5 Olivia Rodrigo                  5.83
##  6 Camila Cabello                  5.76
##  7 Tones And I                     5.71
##  8 Shawn Mendes                    5.47
##  9 Justin Bieber                   5.36
## 10 Luis Fonsi                      4.63
## # ... with 37 more rows

Number of No.1 tracks per artist

df %>%
  filter(rank == 1) %>% 
  group_by(artist) %>% 
  summarise(no1_tracks = n_distinct(title)) %>% 
  arrange(desc(no1_tracks)) %>% 
  head(10)
## # A tibble: 10 x 2
##    artist         no1_tracks
##    <chr>               <int>
##  1 Drake                   7
##  2 Post Malone             6
##  3 Ariana Grande           4
##  4 Billie Eilish           4
##  5 Taylor Swift            4
##  6 The Weeknd              3
##  7 Bad Bunny               2
##  8 Camila Cabello          2
##  9 Daddy Yankee            2
## 10 Ed Sheeran              2

Since Drake is one of my favorite artists, I will have a closer look at him.

How Many songs does he have in the charts?

df_drake <- df %>% 
  filter(artist == "Drake")
df_drake %>% 
  select(artist, title) %>% 
  unique() %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1   101

Wow, Drake had 101 Songs in the charts. As we already know, 7 of them got to No. 1. But which songs are those?

df_drake %>% 
  filter(rank == 1) %>% 
  select(title) %>% 
  unique()
## # A tibble: 7 x 1
##   title             
##   <chr>             
## 1 What’s Next       
## 2 Toosie Slide      
## 3 In My Feelings    
## 4 Don’t Matter To Me
## 5 Nonstop           
## 6 Nice For What     
## 7 God's Plan

Which of those songs was the first to reach No.1?

df_drake %>% 
  filter(rank == 1) %>% 
  group_by(title) %>% 
  summarise( date = min(date)) %>% 
  arrange(date)
## # A tibble: 7 x 2
##   title              date      
##   <chr>              <date>    
## 1 God's Plan         2018-01-21
## 2 Nice For What      2018-04-09
## 3 Nonstop            2018-06-29
## 4 Don’t Matter To Me 2018-07-02
## 5 In My Feelings     2018-07-08
## 6 Toosie Slide       2020-04-10
## 7 What’s Next        2021-03-05

God’s Plan was Drakes first No.1 Hit. It reached No.1 on 21.01.2018.

Visualizing his No.1 songs

plot2 <- df %>% 
  filter(artist == "Drake",
         title %in% (
           df %>% filter(artist == "Drake", rank == 1) %>% 
             pull(title)
          )
        ) %>% 
  ggplot(aes(x = date, y = rank, color= title)) +
  geom_line(show.legend = FALSE) +
  theme_minimal() +
  scale_y_reverse()

ggplotly(plot2)

His first No.1 Hit “God’s Plan” was also the one that stayed in the charts the longest

Which of his songs was in the charts the longest?

drake_days <- df_drake %>%  
  group_by(title) %>% 
  summarise(days = n_distinct(date)) %>% 
  arrange(desc(days))

head(drake_days, 10)
## # A tibble: 10 x 2
##    title                                     days
##    <chr>                                    <int>
##  1 God's Plan                                 717
##  2 One Dance                                  674
##  3 Passionfruit                               396
##  4 Money In The Grave (Drake ft. Rick Ross)   378
##  5 In My Feelings                             364
##  6 Toosie Slide                               325
##  7 Nonstop                                    306
##  8 Laugh Now Cry Later (feat. Lil Durk)       302
##  9 Nice For What                              298
## 10 Fake Love                                  257
summary(drake_days)
##     title                days      
##  Length:101         Min.   :  1.0  
##  Class :character   1st Qu.:  7.0  
##  Mode  :character   Median : 22.0  
##                     Mean   : 72.2  
##                     3rd Qu.: 81.0  
##                     Max.   :717.0

Creating an overview of his songs and their ranks

df %>% 
  group_by(title) %>% 
  summarise(max_rank = min(rank), mid = median(rank), min_rank = max(rank)) %>% 
  arrange(mid) %>% 
  inner_join(drake_days)
## Joining, by = "title"
## # A tibble: 101 x 5
##    title                                max_rank   mid min_rank  days
##    <chr>                                   <int> <dbl>    <int> <int>
##  1 Don’t Matter To Me                          1   2         10    21
##  2 I'm Upset                                   6  46        200   118
##  3 Toosie Slide                                1  56        200   325
##  4 Emotionless                                 3  59        185    34
##  5 8 Out Of 10                                 6  59.5      197    22
##  6 Laugh Now Cry Later (feat. Lil Durk)        2  61        200   302
##  7 Nonstop                                     1  62        199   306
##  8 Survival                                    2  62        184    22
##  9 In My Feelings                              1  63        200   364
## 10 Pain 1993 (with Playboi Carti)              4  63        119    21
## # ... with 91 more rows